@@ -161,78 +161,46 @@ module Agents
               log "Storing new result for '#{name}': #{doc.inspect}"
               create_event :payload => doc
             end
-          else
-            output = {}
-            interpolated['extract'].each do |name, extraction_details|
-              case extraction_type
-              when "text"
-                regexp = Regexp.new(extraction_details['regexp'])
-                result = []
-                doc.scan(regexp) {
-                  result << Regexp.last_match[extraction_details['index']]
-                }
-                log "Extracting #{extraction_type} at #{regexp}: #{result}"
-              when "json"
-                result = Utils.values_at(doc, extraction_details['path'])
-                log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
-              else
-                case
-                when css = extraction_details['css']
-                  nodes = doc.css(css)
-                when xpath = extraction_details['xpath']
-                  doc.remove_namespaces! # ignore xmlns, useful when parsing atom feeds
-                  nodes = doc.xpath(xpath)
-                else
-                  error '"css" or "xpath" is required for HTML or XML extraction'
-                  return
-                end
-                case nodes
-                when Nokogiri::XML::NodeSet
-                  result = nodes.map { |node|
-                    case value = node.xpath(extraction_details['value'])
-                    when Float
-                      # Node#xpath() returns any numeric value as float;
-                      # convert it to integer as appropriate.
-                      value = value.to_i if value.to_i == value
-                    end
-                    value.to_s
-                  }
-                else
-                  error "The result of HTML/XML extraction was not a NodeSet"
-                  return
-                end
-                log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
-              end
-              output[name] = result
+            next
+          end
+
+          output =
+            case extraction_type
+            when 'json'
+              extract_json(doc)
+            when 'text'
+              extract_text(doc)
+            else
+              extract_xml(doc)
             end
 
-            num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq
+          num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq
 
-            if num_unique_lengths.length != 1
-              error "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
-              return
-            end
+          if num_unique_lengths.length != 1
+            raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
+          end
 
-            old_events = previous_payloads num_unique_lengths.first
-            num_unique_lengths.first.times do |index|
-              result = {}
-              interpolated['extract'].keys.each do |name|
-                result[name] = output[name][index]
-                if name.to_s == 'url'
-                  result[name] = (response.env[:url] + result[name]).to_s
-                end
+          old_events = previous_payloads num_unique_lengths.first
+          num_unique_lengths.first.times do |index|
+            result = {}
+            interpolated['extract'].keys.each do |name|
+              result[name] = output[name][index]
+              if name.to_s == 'url'
+                result[name] = (response.env[:url] + result[name]).to_s
              end
+            end
 
-              if store_payload!(old_events, result)
-                log "Storing new parsed result for '#{name}': #{result.inspect}"
-                create_event :payload => result
-              end
+            if store_payload!(old_events, result)
+              log "Storing new parsed result for '#{name}': #{result.inspect}"
+              create_event :payload => result
            end
          end
        else
-          error "Failed: #{response.inspect}"
+          raise "Failed: #{response.inspect}"
        end
      end
+    rescue => e
+      error e.message
    end
 
    def receive(incoming_events)
@@ -266,7 +234,7 @@ module Agents
             old_event.expires_at = new_event_expiration_date
             old_event.save!
             return false
-          end 
+          end
         end
         return true
       end
@@ -305,27 +273,81 @@ module Agents
       end).to_s
     end
 
+    def extract_each(doc, &block)
+      interpolated['extract'].each_with_object({}) { |(name, extraction_details), output|
+        output[name] = block.call(extraction_details)
+      }
+    end
+
+    def extract_json(doc)
+      extract_each(doc) { |extraction_details|
+        result = Utils.values_at(doc, extraction_details['path'])
+        log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
+        result
+      }
+    end
+
+    def extract_text(doc)
+      extract_each(doc) { |extraction_details|
+        regexp = Regexp.new(extraction_details['regexp'])
+        result = []
+        doc.scan(regexp) {
+          result << Regexp.last_match[extraction_details['index']]
+        }
+        log "Extracting #{extraction_type} at #{regexp}: #{result}"
+        result
+      }
+    end
+
+    def extract_xml(doc)
+      extract_each(doc) { |extraction_details|
+        case
+        when css = extraction_details['css']
+          nodes = doc.css(css)
+        when xpath = extraction_details['xpath']
+          doc.remove_namespaces! # ignore xmlns, useful when parsing atom feeds
+          nodes = doc.xpath(xpath)
+        else
+          raise '"css" or "xpath" is required for HTML or XML extraction'
+        end
+        case nodes
+        when Nokogiri::XML::NodeSet
+          result = nodes.map { |node|
+            case value = node.xpath(extraction_details['value'])
+            when Float
+              # Node#xpath() returns any numeric value as float;
+              # convert it to integer as appropriate.
+              value = value.to_i if value.to_i == value
+            end
+            value.to_s
+          }
+        else
+          raise "The result of HTML/XML extraction was not a NodeSet"
+        end
+        log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
+        result
+      }
+    end
+
     def parse(data)
       case extraction_type
-        when "xml"
-          Nokogiri::XML(data)
-        when "json"
-          JSON.parse(data)
-        when "html"
-          Nokogiri::HTML(data)
-        when "text"
-          data
-        else
-          raise "Unknown extraction type #{extraction_type}"
+      when "xml"
+        Nokogiri::XML(data)
+      when "json"
+        JSON.parse(data)
+      when "html"
+        Nokogiri::HTML(data)
+      when "text"
+        data
+      else
+        raise "Unknown extraction type #{extraction_type}"
       end
     end
 
     def is_positive_integer?(value)
-      begin
-        Integer(value) >= 0
-      rescue
-        false
-      end
+      Integer(value) >= 0
+    rescue
+      false
     end
   end
 end
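
For reference, a minimal standalone sketch of the each_with_object dispatch pattern that the new extract_each helper is built on. The extract configuration, sample document, and names below are illustrative only, not taken from Huginn:

# A minimal sketch, assuming a WebsiteAgent-style 'extract' option that maps
# result names to extraction details (this config and document are made up).
extract_config = {
  'title' => { 'regexp' => '<title>(.*?)</title>', 'index' => 1 }
}

# Build one { name => results } hash, as extract_each does with
# interpolated['extract'], delegating the per-extraction work to the block.
def extract_each(config, &block)
  config.each_with_object({}) { |(name, details), output|
    output[name] = block.call(details)
  }
end

doc = '<html><head><title>Example</title></head></html>'

# The 'text' extractor in miniature: scan with the configured regexp and
# collect the capture group selected by 'index'.
output = extract_each(extract_config) { |details|
  results = []
  doc.scan(Regexp.new(details['regexp'])) {
    results << Regexp.last_match[details['index']]
  }
  results
}

p output  # => {"title"=>["Example"]}

Because each extractor returns its per-name result array to the shared loop, the uneven-match check in the diff above only has to compare the lengths of the arrays in one output hash.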